import os
import pandas as pd
import numpy as np
import json
import matplotlib as plot
from matplotlib import pyplot as plt
import folium
import seaborn as sns
import requests
import warnings
# Silence pandas / seaborn warnings so the notebook output stays readable.
warnings.filterwarnings(action="ignore")
# Change the working directory to the project folder.
# A raw string is required here: in a normal literal the Windows
# backslashes form escape sequences (e.g. "\A"), which is a
# DeprecationWarning today and a SyntaxError in future Python versions.
os.chdir(r"C:\A-Personal\AI2\CourseaProjects\DA-1\Class3-Week4")
# Load the LA traffic-collision data set (2010 to present).
df_Full = pd.read_csv("traffic-collision-data-from-2010-to-present.csv")
print("the rows and cols count are ",df_Full.shape)
df_Full.head(5)
2.1 - Drop the columns listed above from the data set
# Drop administrative / boundary columns that are not used in the analysis.
df_Full = df_Full.drop(
["Crime Code","Crime Code Description","Census Tracts","Precinct Boundaries","LA Specific Plans",
"Council Districts","Neighborhood Councils (Certified)","Zip Codes"],axis=1)
2.2 - Print the columns again after the deletion
# Show the remaining columns after the drop.
print(df_Full.columns)
# Rename every column to an underscore style (e.g. "DR Number" ->
# "DR_Number") so the names work with attribute access and .query() strings.
df_Full.columns = ['DR_Number', 'Date_Reported', 'Date_Occurred', 'Time_Occurred',
'Area_ID', 'Area_Name', 'Reporting_District', 'MO_Codes', 'Victim_Age',
'Victim_Sex', 'Victim_Descent', 'Premise_Code', 'Premise_Description',
'Address', 'Cross_Street', 'Location']
df_Full.head(1)
2.3- Add columns for datetime split and show the result
# Split the date strings into year / month / day string columns for both
# the report and occurrence dates.
# NOTE(review): the fixed slices assume dates are formatted
# "YYYY-MM-DD..." — confirm against the CSV contents.
df_Full["Report_Year"] = df_Full["Date_Reported"].str[:4]
df_Full["Report_Month"] = df_Full["Date_Reported"].str[5:7]
df_Full["Report_Day"] = df_Full["Date_Reported"].str[8:10]
df_Full["Occur_Year"] = df_Full["Date_Occurred"].str[:4]
df_Full["Occur_Month"] = df_Full["Date_Occurred"].str[5:7]
df_Full["Occur_Day"] = df_Full["Date_Occurred"].str[8:10]
# Convert the raw strings to datetimes so the .dt accessor can be used.
df_Full["Date_Reported"] = pd.to_datetime(df_Full["Date_Reported"])
df_Full["Date_Occurred"] = pd.to_datetime(df_Full["Date_Occurred"])
# Day of week as 0 (Monday) .. 6 (Sunday).
df_Full["WeekDay"] = df_Full.Date_Occurred.dt.dayofweek
# Bucket the raw Time_Occurred value (HHMM as an integer) into one of
# eight three-hour periods.  Actual boundaries implemented below:
#   [0000-0300) -> 1 midnight        [0300-0600) -> 2 early morning
#   [0600-0900) -> 3 morning         [0900-1200) -> 4 nearly noon
#   [1200-1500) -> 5 late noon       [1500-1800) -> 6 afternoon
#   [1800-2100) -> 7 evening         [2100-2400) -> 8 early night
def ConvertClass(value):
    """Return the 1-8 time-period class for an HHMM integer time.

    Values of 2400 or more fall through and return None.
    """
    upper_bounds = (300, 600, 900, 1200, 1500, 1800, 2100, 2400)
    for period, bound in enumerate(upper_bounds, start=1):
        if value < bound:
            return period
# Apply the time-period classifier and preview the derived columns.
df_Full["TimingClass"] = df_Full["Time_Occurred"].apply(ConvertClass)
df_Full[["TimingClass","Time_Occurred"]].head()
df_Full[["Date_Reported","Date_Occurred","Report_Year","Report_Month","Report_Day","TimingClass"]].head()
2.4 - Set up the longitude and latitude information separately from the location field
# To get the longtitue and latutude from Location field
def GetLocation(value):
result = value.split(',')[0]+','+value.split(',')[1]+'}'
return json.loads(result.replace("'",'"'))
# Extract longitude/latitude into their own columns.
# NOTE(review): "Longtitue" is misspelled but is used as the column name
# throughout the rest of this notebook, so it is kept as-is.
df_Full["Longtitue"] = df_Full["Location"].apply(lambda x: GetLocation(x)["longitude"])
df_Full["Latitude"] = df_Full["Location"].apply(lambda x: GetLocation(x)["latitude"])
df_Full[["Location","Longtitue","Latitude"]].head()
# After the conversion the raw Location field is no longer needed.
df_Full = df_Full.drop(["Location"],axis=1)
The values for sex and descent are all short codes; update them to full names, which will be more convenient for data display later on
The sex code:
based on the Victim Descent Code:
And for victim age, classify the values into bins of 10 years each
2.5- To update values for victim sex and descent
# Expand the one-letter Victim_Sex codes to full words.
def ConverValueForSex(value):
    """Return "Female"/"Male" for "F"/"M"; any other value is unchanged."""
    full_names = {"F": "Female", "M": "Male"}
    return full_names.get(value, value)
# Apply the sex-code expansion and preview the result.
df_Full["Victim_Sex"] = df_Full["Victim_Sex"].apply(ConverValueForSex)
df_Full["Victim_Sex"].head()
# Expand the one-letter victim-descent codes into readable names.
def ConversionDescentCode(value):
    """Return the full descent name for a one-letter code.

    Unknown codes (and NaN) return None, which pandas keeps as NaN after
    .apply() — the same behaviour as the original linear scan.
    """
    dictDescent = {"A":"Other Asian","B":"Black","C":"Chinese","D":"Cambodian","F":"Filipino","G":"Guamanian",
                   "H":"Hispanic/Latin/Mexican","I":"American Indian/Alaskan Native",
                   "J":"Japanese","K":"Korean","L":"Laotian","O":"Other","P":"Pacific Islander",
                   "S":"Samoan","U":"Hawaiian","V":"Vietnamese","W":"White","X":"Unknown","Z":"Asian Indian"
                  }
    # Direct O(1) lookup replaces the original loop over every dict item.
    return dictDescent.get(value)
# Apply the descent-code expansion and preview the result.
df_Full["Victim_Descent"] = df_Full["Victim_Descent"].apply(ConversionDescentCode)
df_Full["Victim_Descent"].head()
2.6 - Classify the victim age into 10-year bins
# Bucket victim ages into 10-year bins.
def ConvertAge(value):
    """Return the 10-year age-group label ("10-20" ... "90-100").

    Ages of 20 or below all map to "10-20"; ages above 100 fall through
    and yield None (NaN after .apply()).
    """
    bounds = (20, 30, 40, 50, 60, 70, 80, 90, 100)
    for upper in bounds:
        if value <= upper:
            return "{0}-{1}".format(upper - 10, upper)
# Apply the age binning and preview the result.
df_Full["Victim_Age_Group"] = df_Full["Victim_Age"].apply(ConvertAge)
df_Full[["Victim_Age","Victim_Age_Group"]].head()
2.7 - Get the major MO code from the original value, leaving it unchanged if it only contains one code
# Pick a single "major" MO code out of the space-separated MO_Codes value.
def ConvertMOCode(value):
    """Return the representative MO code from a space-separated code list.

    Codes are compared as strings.  The largest code that is <= "3023"
    wins; if every code is greater than "3023", the largest code overall
    is returned instead.  NaN input becomes the string "nan" and is
    returned as-is (it sorts above "3023").
    """
    moCodeList = str(value).split(' ')
    moCodeList.sort()
    result = None   # largest code <= "3023" seen so far
    temp = None     # largest code >  "3023" seen so far
    for item in moCodeList:
        if item > "3023":
            temp = item
            continue
        result = item
    # Fall back to the high-range code when no low-range code exists.
    # ("is None" replaces the original "== None" comparisons.)
    if (result is None) and (temp is not None):
        result = temp
    return result
# Derive the major MO code; preview only rows where MO_Codes is present.
df_Full["MajorMoCode"] = df_Full["MO_Codes"].apply(ConvertMOCode)
df_Full[df_Full["MO_Codes"].isna()==False][["MO_Codes","MajorMoCode"]].head()
2.8 - Get the columns which contain NaN values
# Count NaN values per column and show only the columns that have any.
df_tmp = df_Full.isna().sum().sort_values(ascending=False).to_frame()
df_tmp[df_tmp[0]>0]
2.9- Remove the records
# Premise_Code / Premise_Description are missing on only ~25 rows, a tiny
# fraction of the whole data set, so those records are dropped outright.
df_Full.drop(df_Full[df_Full["Premise_Code"].isna()].index,inplace=True)
df_Full.drop(df_Full[df_Full["Premise_Description"].isna()].index,inplace=True)
# Verify the drop: both counts should now be 0.
print("The total rows of Premise Code with NaN:" , df_Full[df_Full["Premise_Code"].isna()].shape[0])
print("The total rows of Premise Description with NaN:" , df_Full[df_Full["Premise_Description"].isna()].shape[0])
3.1- Accident trend per year
# Accident counts per occurrence year, shown as a labelled bar chart.
df_TotalPerYear = df_Full.groupby("Occur_Year").count()[["DR_Number"]].reset_index(["Occur_Year"])
f, ax = plt.subplots(figsize=(14,8))
sns.set(style="whitegrid")
showPlot = sns.barplot(data=df_TotalPerYear,x="Occur_Year",y="DR_Number",color="lightblue")
# Write the count above each bar; values.name is the positional index
# (0..n-1 after reset_index), which matches the bar's x position.
for index,values in df_TotalPerYear.iterrows():
    showPlot.text(values.name,values["DR_Number"],values["DR_Number"],color="black",ha="center")
plt.show()
3.1.end- The information from above plot:
3.2- Accident trend per month under each year
# Get all records except the (partial) year 2019
df_TotalPerMonthUnderYear = df_Full.query("Occur_Year!='2019'")
# Count accidents per (month, year) pair
df_TotalPerMonthUnderYear = df_TotalPerMonthUnderYear.groupby(["Occur_Month","Occur_Year"]).count()[["DR_Number"]]
# Back to a flat DataFrame for plotting
df_TotalPerMonthUnderYear = df_TotalPerMonthUnderYear.reset_index()
# Overall monthly trend, aggregated over all years
f,ax = plt.subplots(figsize=(15,5))
g2 = sns.lineplot(data=df_TotalPerMonthUnderYear,x="Occur_Month",y="DR_Number",palette="tab20c")
g2.set_title("Total accident per month for whole")
plt.show()
# One small panel per year
grid = sns.FacetGrid(data=df_TotalPerMonthUnderYear,col="Occur_Year",hue="Occur_Year",palette="tab20c")
grid.map(plt.plot,"Occur_Month","DR_Number",marker="o")
grid.fig.set_figheight(6)
grid.fig.set_figwidth(15)
# All years overlaid in a single figure
f,ax = plt.subplots(figsize=(15,8))
g = sns.pointplot(data=df_TotalPerMonthUnderYear,x="Occur_Month",y="DR_Number",hue="Occur_Year",palette="tab20c")
g.set_title("Total accident per month under each year")
plt.show()
3.2.end- The information from above plot:
3.3- Accident trend per week day under each month
# Accidents per (weekday, month), excluding 2019.
# NOTE(review): despite its name, this frame holds per-(WeekDay, Occur_Month)
# counts, not per-hour counts.
df_PerHourOnEachMonth = df_Full.query("Occur_Year != '2019'")
df_PerHourOnEachMonth = df_PerHourOnEachMonth.groupby(["WeekDay","Occur_Month"]).count()["DR_Number"]
df_PerHourOnEachMonth = df_PerHourOnEachMonth.reset_index()
# One panel per month
grid = sns.FacetGrid(col="Occur_Month",hue="Occur_Month",col_wrap=4,height=1.5,
data=df_PerHourOnEachMonth,palette="tab20c")
grid.map(plt.plot,"WeekDay","DR_Number",marker="o")
grid.fig.set_figheight(4)
grid.fig.set_figwidth(20)
# All months overlaid in a single figure
f,ax = plt.subplots(figsize=(15,8))
g_perDay = sns.pointplot(data=df_PerHourOnEachMonth,x=df_PerHourOnEachMonth.WeekDay,y=df_PerHourOnEachMonth.DR_Number,
hue=df_PerHourOnEachMonth.Occur_Month,palette="tab20c")
g_perDay.set_title("Per WeekDay EachMonth")
plt.show()
3.3.end- The information from above plots:
3.4- Accident trend per hours under each week day
# Accidents per (time class, weekday), excluding 2019.
# NOTE(review): the variable name is reused from section 3.3; here it
# holds per-(TimingClass, WeekDay) counts.
df_PerHourOnEachMonth = df_Full.query("Occur_Year != '2019'")
df_PerHourOnEachMonth = df_PerHourOnEachMonth.groupby(["TimingClass","WeekDay"]).count()["DR_Number"]
df_PerHourOnEachMonth = df_PerHourOnEachMonth.reset_index()
# One panel per weekday
grid = sns.FacetGrid(df_PerHourOnEachMonth, col="WeekDay", hue="WeekDay", palette="tab20c",
col_wrap=7, height=2)
grid.map(plt.plot, "TimingClass", "DR_Number", marker="o")
grid.fig.set_figheight(4)
grid.fig.set_figwidth(20)
# All weekdays combined in one grouped bar chart
f,ax = plt.subplots(figsize=(15,8))
g_perDay = sns.barplot(data=df_PerHourOnEachMonth,x=df_PerHourOnEachMonth.TimingClass,y=df_PerHourOnEachMonth.DR_Number,
hue=df_PerHourOnEachMonth.WeekDay,palette="tab20c")
g_perDay.set_title("Per TimeZone EachMonth")
plt.show()
3.4.end- The information from above plots:
3.5- Accident trend on different sex
# Accidents per (sex, month, year) — known sexes only, 2019 excluded.
df_BySex_PerYear = df_Full.query("Occur_Year != '2019' and Victim_Sex in ('Male','Female')").groupby(
["Victim_Sex","Occur_Month","Occur_Year"]).count()["DR_Number"]
df_BySex_PerYear = df_BySex_PerYear.reset_index()
# One panel per year, male vs female overlaid.
grid = sns.FacetGrid(data=df_BySex_PerYear,col="Occur_Year",hue="Victim_Sex",palette="YlGnBu",col_wrap=3)
grid.map(plt.plot,"Occur_Month","DR_Number",marker='o').add_legend()
grid.fig.set_figheight(4)
grid.fig.set_figwidth(20)
3.5.end- The information from above plots:
- Male victims are always far more numerous than female victims
- The male/female trend matches the trend of total accidents per year
3.6- Accident heat map on different age
# Heat map of accident counts per (age group, year), 2019 excluded.
df_ByAge_PerYear = df_Full.query("Occur_Year != '2019'").groupby(
["Victim_Age_Group","Occur_Year"]).count()["DR_Number"]
df_ByAge_PerYear = df_ByAge_PerYear.reset_index()
# Keyword arguments keep this working on modern pandas: the positional
# form of DataFrame.pivot was deprecated in 1.1 and removed in 2.0.
df_ByAge_PerYear = df_ByAge_PerYear.pivot(index="Victim_Age_Group",columns="Occur_Year",values="DR_Number")
f,ax = plt.subplots(figsize=(15,8))
ax = sns.heatmap(df_ByAge_PerYear,annot=True, fmt="d",cmap="YlGnBu",linewidths=.5)
# Zoom in on single ages between 21 and 49 to find the peak ages.
df_ByAge_PerYear = df_Full.query("Occur_Year != '2019' and Victim_Age > 20 and Victim_Age < 50 ").groupby(
["Victim_Age","Occur_Year"]).count()["DR_Number"]
df_ByAge_PerYear = df_ByAge_PerYear.reset_index()
df_ByAge_PerYear = df_ByAge_PerYear.pivot(index="Victim_Age",columns="Occur_Year",values="DR_Number")
f,ax = plt.subplots(figsize=(15,8))
ax = sns.heatmap(df_ByAge_PerYear,annot=True, fmt="d",cmap="YlGnBu",linewidths=.5)
3.6.end- The information from above plots:
- Most victims are aged 20-60, and 20-30 is the largest group
- Zooming in on the 20-60 range, the largest groups are at ages 25 and 30
3.7- Accident heat map on different race
# Heat map of accident counts per (descent, year), 2019 excluded.
df_ByDescent_PerYear = df_Full.query("Occur_Year != '2019'").groupby(
["Victim_Descent","Occur_Year"]).count()["DR_Number"]
df_ByDescent_PerYear = df_ByDescent_PerYear.reset_index()
# Keyword arguments keep this working on modern pandas: the positional
# form of DataFrame.pivot was deprecated in 1.1 and removed in 2.0.
df_ByDescent_PerYear = df_ByDescent_PerYear.pivot(index="Occur_Year",columns="Victim_Descent",values="DR_Number")
# Missing (descent, year) combinations become 0 so fmt="d" can be used.
df_ByDescent_PerYear = df_ByDescent_PerYear.replace(np.nan,0)
df_ByDescent_PerYear = df_ByDescent_PerYear.astype("int64")
f,ax = plt.subplots(figsize=(15,8))
sns.heatmap(data=df_ByDescent_PerYear,annot=True,fmt="d",cmap="YlGnBu",linewidths=.5)
# Zoom in: descent vs single ages 21-49.
df_ByDescent_PerYear = df_Full.query("Occur_Year != '2019' and Victim_Age > 20 and Victim_Age < 50 ").groupby(
["Victim_Descent","Victim_Age"]).count()["DR_Number"]
df_ByDescent_PerYear = df_ByDescent_PerYear.reset_index()
df_ByDescent_PerYear = df_ByDescent_PerYear.pivot(index="Victim_Age",columns="Victim_Descent",values="DR_Number")
df_ByDescent_PerYear = df_ByDescent_PerYear.replace(np.nan,0)
df_ByDescent_PerYear = df_ByDescent_PerYear.astype("int64")
f,ax = plt.subplots(figsize=(15,8))
sns.heatmap(data=df_ByDescent_PerYear,annot=True,fmt="d",cmap="YlGnBu",linewidths=.5)
3.7.end- The information from above plots:
3.8- The location points with all years except 2019
# High-risk candidates: all full years (2019 excluded), victims aged
# 21-49, and the busier time classes (1,5,6,7).
df_ByConditions_Location = df_Full.query(
"Occur_Year != '2019'\
and Victim_Age > 20 and Victim_Age < 50 \
and TimingClass in (1,5,6,7)").copy()
# .copy() prevents pandas' SettingWithCopyWarning on the casts below.
df_ByConditions_Location["Longtitue"] = df_ByConditions_Location["Longtitue"].astype("float64")
df_ByConditions_Location["Latitude"] = df_ByConditions_Location["Latitude"].astype("float64")
print("The total of records is ", df_ByConditions_Location.count()["DR_Number"])
# Accident count per exact location; keep only high-frequency ones (> 10).
df_temp = df_ByConditions_Location.groupby(["Area_ID","Area_Name","Longtitue","Latitude"]).count()["DR_Number"].reset_index()
df_temp = df_temp.sort_values("DR_Number",ascending=False)
df_temp = df_temp.drop(df_temp[df_temp["DR_Number"]<=10].index)
print("The total of records on high frequency location",df_temp.count()["DR_Number"])
# Plot each high-frequency location on a folium map of Los Angeles.
map_clusters = folium.Map(location=[34.087994, -118.179010], zoom_start=10)
for lat, lon, label in zip(df_temp['Latitude'], df_temp['Longtitue'], df_temp["Area_Name"]):
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        fill=True,
        fill_opacity=0.7).add_to(map_clusters)
map_clusters
3.9- The location points with all years except 2019 and all week day 4
# Same filters as 3.8 plus WeekDay == 4 (Friday; dt.dayofweek has Monday=0).
df_ByConditions_Location = df_Full.query(
"Occur_Year != '2019'\
and WeekDay == 4 \
and Victim_Age > 20 and Victim_Age < 50 \
and TimingClass in (1,5,6,7)").copy()
# .copy() prevents pandas' SettingWithCopyWarning on the casts below.
df_ByConditions_Location["Longtitue"] = df_ByConditions_Location["Longtitue"].astype("float64")
df_ByConditions_Location["Latitude"] = df_ByConditions_Location["Latitude"].astype("float64")
print("The total of records is ", df_ByConditions_Location.count()["DR_Number"])
# Accident count per exact location; keep only high-frequency ones (> 10).
df_temp = df_ByConditions_Location.groupby(["Area_ID","Area_Name","Longtitue","Latitude"]).count()["DR_Number"].reset_index()
df_temp = df_temp.sort_values("DR_Number",ascending=False)
df_temp = df_temp.drop(df_temp[df_temp["DR_Number"]<=10].index)
print("The total of records on high frequency location",df_temp.count()["DR_Number"])
# Plot each high-frequency location on a folium map of Los Angeles.
map_clusters = folium.Map(location=[34.087994, -118.179010], zoom_start=10)
for lat, lon, label in zip(df_temp['Latitude'], df_temp['Longtitue'], df_temp["Area_Name"]):
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        fill=True,
        fill_opacity=0.7).add_to(map_clusters)
map_clusters
3.8&3.9.end - The information from above plots:
4.1- Prepare foursquare URL and data extraction functions
# Foursquare API credentials and query defaults.
# NOTE(review): credentials are hard-coded and printed below — move them
# to environment variables / a config file and revoke these keys before
# sharing this notebook.
CLIENT_ID = 'A3EBICYRNY3NN5DCUIPZTUPWHRMM5EWDZPXOZW4PIDZH0BKD'
CLIENT_SECRET = 'MA3U0L22R0YELKNWSTDRGQ3ZXJQRPJKZKXSBX0ZVHBIRUOA1'
VERSION = '20190709'
LIMIT = 100
radius = 100
neighborhood_latitude = '43.65426'
neighborhood_longitude = '-79.360636'
# Sample "explore" URL for a single point (used only for the printout).
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
CLIENT_ID,
CLIENT_SECRET,
VERSION,
neighborhood_latitude,
neighborhood_longitude,
radius,
LIMIT)
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)
print('The URL:' + url)
# Query the Foursquare "explore" endpoint for venues around each location.
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    """Return a DataFrame of venues near each (name, lat, lng) point.

    One API call is made per location; the venue name, coordinates and
    first category are collected.  Relies on the module-level CLIENT_ID /
    CLIENT_SECRET / VERSION / LIMIT values defined above.
    """
    base_url = ('https://api.foursquare.com/v2/venues/explore?'
                '&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}')
    collected = []
    for name, lat, lng in zip(names, latitudes, longitudes):
        # create the API request URL for this location
        request_url = base_url.format(CLIENT_ID, CLIENT_SECRET, VERSION,
                                      lat, lng, radius, LIMIT)
        # make the GET request; only the first recommendation group is used
        items = requests.get(request_url).json()["response"]['groups'][0]['items']
        # keep only the relevant fields for each nearby venue
        for entry in items:
            venue = entry['venue']
            collected.append((name,
                              lat,
                              lng,
                              venue['name'],
                              venue['location']['lat'],
                              venue['location']['lng'],
                              venue['categories'][0]['name']))
    nearby_venues = pd.DataFrame(collected)
    nearby_venues.columns = ['Neighborhood',
                             'Neighborhood Latitude',
                             'Neighborhood Longitude',
                             'Venue',
                             'Venue Latitude',
                             'Venue Longitude',
                             'Venue Category']
    return nearby_venues
4.2- Prepare High risk cases
# df_temp here is the high-risk location set from section 3.9; fetch the
# nearby venues for each location from Foursquare (network call).
locationNearByInfo = getNearbyVenues(df_temp["Area_Name"],df_temp["Latitude"],df_temp["Longtitue"])
print("Shape of data set is:",locationNearByInfo.shape)
locationNearByInfo.head()
# Inspect the most common venue categories — they become the features.
locationNearByInfo.groupby("Venue Category").count().sort_values(by="Neighborhood",ascending=False).head()
# Collapse detailed Foursquare venue categories into coarse keyword groups.
def VenueConversion(x):
    """Map a venue category to one of the coarse keyword classes.

    The first keyword found as a case-insensitive substring of the
    category wins; categories matching none of them become "Others".
    """
    keywords = ["Store","Shop","Bar","Restaurant","Place","Bank","Café","Hotel","Pharmacy","Bakery",
                "Supermarket","Gym","Studio","Joint","Lounge","Club","Mall","Food","Venue",
                "Park","Spot","Station","Theater","pub","Spa","Boutique"]
    lowered = x.lower()
    for keyword in keywords:
        if lowered.find(keyword.lower()) >= 0:
            return keyword
    return "Others"
4.2.1- Data conversion on High risk data
# Add the coarse venue class produced by VenueConversion.
df_temp_convert = locationNearByInfo.copy()
df_temp_convert["Vcat"] = df_temp_convert["Venue Category"].apply(VenueConversion)
df_temp_convert.head()
# One-hot encode the venue classes, then sum per (Latitude, Longitude) so
# each location row holds venue-class counts.
locationNearByInfo_GetDummy = pd.get_dummies(df_temp_convert[["Vcat"]],prefix="",prefix_sep="")
locationNearByInfo_GetDummy["Neighborhood"] = locationNearByInfo["Neighborhood"]
locationNearByInfo_GetDummy["Latitude"] = locationNearByInfo["Neighborhood Latitude"]
locationNearByInfo_GetDummy["Longitude"] = locationNearByInfo["Neighborhood Longitude"]
locationNearByInfo_GetDummy = locationNearByInfo_GetDummy.groupby(["Latitude","Longitude"]).sum().reset_index()
# Join the venue features back onto the high-risk location rows.
locationNearByInfo_HighRisk = pd.merge(df_temp,locationNearByInfo_GetDummy,how="inner"
,left_on=["Longtitue","Latitude"],
right_on=["Longitude","Latitude"])
print(locationNearByInfo_HighRisk.shape)
locationNearByInfo_HighRisk.head()
4.3- Prepare low risk cases
# Prepare the low-risk data: the same conditions as the high-risk cases,
# except only locations with <= 10 accidents over the ~9 years are kept.
df_ByConditions_Location = df_Full.query(
"Occur_Year != '2019'\
and WeekDay == 4 \
and Victim_Age > 20 and Victim_Age < 50 \
and TimingClass in (1,5,6,7)").copy()
# .copy() prevents pandas' SettingWithCopyWarning on the casts below.
df_ByConditions_Location["Longtitue"] = df_ByConditions_Location["Longtitue"].astype("float64")
df_ByConditions_Location["Latitude"] = df_ByConditions_Location["Latitude"].astype("float64")
print("The total of records is ", df_ByConditions_Location.count()["DR_Number"])
# Aggregate per exact location and keep only low-frequency ones (<= 10).
df_temp_lowCases = df_ByConditions_Location.groupby(["Area_ID","Area_Name","Longtitue","Latitude"]).count()["DR_Number"].reset_index()
df_temp_lowCases = df_temp_lowCases.sort_values("DR_Number",ascending=False)
df_temp_lowCases = df_temp_lowCases.drop(df_temp_lowCases[df_temp_lowCases["DR_Number"]>10].index)
# Message fixed: this set holds the LOW-frequency locations.
print("The total of records on low frequency location",df_temp_lowCases.count()["DR_Number"])
# Sample a subset of single-accident locations to balance the class sizes.
df_Final_lowCases = df_temp_lowCases.sort_values(by="DR_Number").query("DR_Number==1").sample(frac=0.024475)
print("The low risk location for test is total: ",df_Final_lowCases.shape)
df_Final_lowCases.head()
# Fetch nearby venue information from Foursquare for each sampled location.
locationNearByInfo_LowRiskCases = getNearbyVenues(df_Final_lowCases["Area_Name"]
,df_Final_lowCases["Latitude"],df_Final_lowCases["Longtitue"])
print("Shape of data set is:",locationNearByInfo_LowRiskCases.shape)
print(locationNearByInfo_LowRiskCases.head())
locationNearByInfo_LowRiskCases.groupby("Venue Category").count().sort_values(by="Neighborhood",ascending=False).head()
4.3.1- Data conversion on Low risk data
# Same venue-class conversion and one-hot feature build as in 4.2.1,
# applied to the low-risk locations.
df_temp_convert_lower = locationNearByInfo_LowRiskCases.copy()
df_temp_convert_lower["Vcat"] = df_temp_convert_lower["Venue Category"].apply(VenueConversion)
df_temp_convert_lower.head()
locationNearByInfo_LowGetDummy = pd.get_dummies(df_temp_convert_lower[["Vcat"]],prefix="",prefix_sep="")
locationNearByInfo_LowGetDummy["Neighborhood"] = locationNearByInfo_LowRiskCases["Neighborhood"]
locationNearByInfo_LowGetDummy["Latitude"] = locationNearByInfo_LowRiskCases["Neighborhood Latitude"]
locationNearByInfo_LowGetDummy["Longitude"] = locationNearByInfo_LowRiskCases["Neighborhood Longitude"]
# Sum the one-hot rows per location to get venue-class counts.
locationNearByInfo_LowGetDummy = locationNearByInfo_LowGetDummy.groupby(["Latitude","Longitude"]).sum().reset_index()
# Join the venue features back onto the low-risk location rows.
locationNearByInfo_LowRisk = pd.merge(df_temp_lowCases,locationNearByInfo_LowGetDummy,how="inner"
,left_on=["Longtitue","Latitude"]
,right_on=["Longitude","Latitude"])
print(locationNearByInfo_LowRisk.shape)
locationNearByInfo_LowRisk.head()
4.4 - Build the model
# We now have the two labelled location sets:
#   locationNearByInfo_HighRisk / locationNearByInfo_LowRisk
# Label them for classification.
locationNearByInfo_HighRisk["Risk"] = "High"
locationNearByInfo_LowRisk["Risk"] = "Low"
# Combine the two sets, restricted to the low-risk column set.
# reindex(columns=...) replaces pd.concat's join_axes argument, which was
# deprecated in pandas 0.25 and removed in 1.0.
locationNearByInfo_All = pd.concat([locationNearByInfo_HighRisk,locationNearByInfo_LowRisk]
).reindex(columns=locationNearByInfo_LowRisk.columns)
# Replace NaN (venue classes absent from one side) with 0.
locationNearByInfo_All = locationNearByInfo_All.replace(np.nan,0)
# Drop identification columns so they do not leak into the features.
locationNearByInfo_All.pop("Longitude")
locationNearByInfo_All.pop("Latitude")
locationNearByInfo_All.pop("Area_Name")
locationNearByInfo_All.pop("Area_ID")
locationNearByInfo_All.pop("Longtitue")
locationNearByInfo_All.pop("DR_Number")
print("The shape of the whole data is :",locationNearByInfo_All.shape)
locationNearByInfo_All.head()
# Create the feature matrix x and the label vector y.
y = locationNearByInfo_All["Risk"]
df_test = locationNearByInfo_All.copy()
df_test.pop("Risk")
x = df_test
from sklearn.model_selection import train_test_split
# 80/20 train/test split with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size=0.2, random_state=4)
print ('Train set:', x_train.shape, y_train.shape)
print ('Test set:', x_test.shape, y_test.shape)
# Fit a regularised logistic regression (C=0.01) and report accuracy on
# the held-out test split.
from sklearn.linear_model import LogisticRegression
import sklearn.metrics
model = LogisticRegression(C=0.01, solver='liblinear').fit(x_train,y_train)
y_hat = model.predict(x_test)
sklearn.metrics.accuracy_score(y_test,y_hat)
4.5- Model verification
# Build a verification sample from the full data set (all years, no filter).
df_ByConditions_Location_verfication = df_Full.copy()
df_ByConditions_Location_verfication["Longtitue"] = df_ByConditions_Location_verfication["Longtitue"].astype("float64")
df_ByConditions_Location_verfication["Latitude"] = df_ByConditions_Location_verfication["Latitude"].astype("float64")
print("The total of records is ", df_ByConditions_Location_verfication.count()["DR_Number"])
# Accident count per exact location.
df_temp_verfication = df_ByConditions_Location_verfication.groupby(["Area_ID"
,"Area_Name"
,"Longtitue"
,"Latitude"]).count()["DR_Number"].reset_index()
df_temp_verfication = df_temp_verfication.sort_values("DR_Number",ascending=False)
# df_temp_verfication = df_temp_verfication.drop(df_temp_verfication[df_temp_verfication["DR_Number"]>3].index)
print("The total of records on high frequency location",df_temp_verfication.count()["DR_Number"])
# A random 0.1% sample keeps the Foursquare call volume manageable.
df_temp_verfication = df_temp_verfication.sample(frac=0.001)
df_temp_verfication.head()
# Fetch venue features for the sampled locations (network call).
location_verification = getNearbyVenues(df_temp_verfication["Area_Name"]
,df_temp_verfication["Latitude"],df_temp_verfication["Longtitue"])
location_verification.head()
4.5.1- Data conversion for verification data
# Same venue-class conversion as for the training data.
df_temp_convert_verify = location_verification.copy()
df_temp_convert_verify["Vcat"] = df_temp_convert_verify["Venue Category"].apply(VenueConversion)
df_temp_convert_verify.head()
# One-hot encode; passing the Series directly yields the same plain
# column names as the prefix=""/prefix_sep="" form used in 4.2.1.
location_verification_final = pd.get_dummies(df_temp_convert_verify["Vcat"])
location_verification_final["Neighborhood"] = location_verification["Neighborhood"]
location_verification_final["Latitude"] = location_verification["Neighborhood Latitude"]
location_verification_final["Longitude"] = location_verification["Neighborhood Longitude"]
location_verification_final = location_verification_final.groupby(["Latitude","Longitude"]).sum().reset_index()
# Join the venue features back onto the sampled location rows.
location_verification_final_step2 = pd.merge(df_temp_verfication,location_verification_final,how="inner"
,left_on=["Longtitue","Latitude"]
,right_on=["Longitude","Latitude"])
location_verification_final_step3 = location_verification_final_step2.copy()
# Drop identification columns, mirroring the training feature set.
location_verification_final_step3.pop("Longitude")
location_verification_final_step3.pop("Latitude")
location_verification_final_step3.pop("Area_Name")
location_verification_final_step3.pop("Area_ID")
location_verification_final_step3.pop("Longtitue")
location_verification_final_step3.pop("DR_Number")
location_verification_final_step3.head()
# Align the verification features to the model's training columns by
# borrowing one training row as a column template (re-indexed to -1 so it
# can be dropped again afterwards).
model_data = locationNearByInfo_All.head(1).rename(index={0:-1})
# reindex(columns=...) replaces pd.concat's join_axes argument, which was
# deprecated in pandas 0.25 and removed in 1.0.
location_verification_final_step4 = pd.concat([model_data,location_verification_final_step3]
).reindex(columns=model_data.columns)
# Venue classes unseen in the verification data become 0.
location_verification_final_step4 = location_verification_final_step4.replace(np.nan,0)
# Drop the borrowed template row and the label column before predicting.
location_verification_final_step4 = location_verification_final_step4.drop([-1])
location_verification_final_step4.pop("Risk")
location_verification_final_step4.head()
y_hat = model.predict(location_verification_final_step4)
location_verification_final_step2["Risk"] = y_hat
# Ground truth: a location is "High" risk when it saw more than 10 accidents.
location_verification_final_step2["ActualRisk"] = location_verification_final_step2["DR_Number"].apply(
lambda x : 'High' if x>10 else 'Low'
)
# Convert both label columns to +/-1 and compute verification accuracy.
y_actual = location_verification_final_step2["ActualRisk"].apply(lambda x : 1 if x=='High' else -1)
y_hat = location_verification_final_step2["Risk"].apply(lambda x : 1 if x=='High' else -1)
sklearn.metrics.accuracy_score(y_actual,y_hat)